In [2]:
#import python packages for data analysis
import pandas as pd, numpy as np
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
%matplotlib inline
import pandas_profiling
In [4]:
#url='https://github.com/aarondov/voila-heroku/blob/master/notebooks/dflowtomid.xlsx'
In [17]:
#read source file into iPython Env.
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
#dflm['Dollar Threshold ID'] = dflm['Dollar Threshold ID'].astype('category')
In [18]:
dflm.shape
Out[18]:
(983, 7)

The enclosed EDA report includes the following key sections:

  • descriptive statistics including ranges
  • general overview of the dataset (shape of the data)
  • histograms for each variable
  • warnings if a particular variable has a unique distribution such as uniform versus normal
  • correlations
  • outliers (extreme values)
  • missing values

To access the report please scroll down to the profile report. I also included the profile report as a separate html file.

In [4]:
pandas_profiling.ProfileReport(dflm)



Out[4]:

Python code to automate identifying missing values for each column in a dataframe.

In [5]:
missingvalues_total = dflm.isnull().sum().sort_values(ascending=False)
missingvalues_total
Out[5]:
revenues               9
agegroup               5
age                    5
payor                  4
Patient Type           0
Dollar Threshold ID    0
Dollar Thresholds      0
dtype: int64

Summarize the data with a pivot tables.

In [6]:
dflm.pivot_table(index=['Dollar Threshold ID','Dollar Thresholds'],values='revenues',margins=True,aggfunc=['sum','count'])
Out[6]:
sum count
revenues revenues
Dollar Threshold ID Dollar Thresholds
2 0-500 103,061 401
3 501-1,000 3,850 5
4 1,001-2,500 47,712 27
5 2,501-5,000 247,266 65
6 5,001-10,000 850,037 111
7 10,001-15,000 1,397,129 110
8 15,001-20,000 2,142,309 124
9 20,001-25,000 2,859,188 128
10 25,001-50,000 80,016 2
11 50,001-100,000 79,468 1
All 7,810,036 974
In [14]:
Piv_patienttype=dflm.pivot_table(index=['Patient Type'],values='revenues',margins=True,aggfunc=['sum','count'])
Piv_patienttype
Out[14]:
sum count
revenues revenues
Patient Type
ER 43,173 84
IP 4,678,884 224
OP 3,087,979 666
All 7,810,036 974
In [11]:
import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')

alt.Chart(dflm).mark_bar().encode(
    x=alt.X('Dollar Thresholds', title='Count of Dollar Thresholds', sort=alt.EncodingSortField(
        field='Dollar Thresholds',
        op='count',
        order='descending')),
    y='count(Dollar Thresholds)',
    color='agegroup:N'
)
Out[11]:
In [12]:
import altair as alt


alt.Chart(dflm).mark_bar().encode(
    x=alt.X('Patient Type', title='Revenues by Patient Type', sort=alt.EncodingSortField(
        field='revenues',
        op='sum',
        order='descending')),
    y='sum(revenues)',
    color='agegroup:N'
)
Out[12]:
In [9]:
import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')

alt.Chart(dflm).mark_bar().encode(
    x=alt.X('Patient Type', title='Revenues by Patient Type', sort=alt.EncodingSortField(
        field='revenues',
        op='count',
        order='descending')),
    y='count(revenues)',
    color='agegroup:N'
)
Out[9]:

Pivot data by aging categories.The majority of accounts are 151+.

In [15]:
Piv_agegroup=dflm.pivot_table(index=['agegroup'],values='revenues',margins=True,aggfunc=['sum','count','mean'])
Piv_agegroup
Out[15]:
sum count mean
revenues revenues revenues
agegroup
1G 0-30 373,191 44 8,482
2G 31-60 255,973 36 7,110
3G 61-90 299,379 36 8,316
4G 91-120 330,533 46 7,186
5G 121-150 225,314 34 6,627
6 G151+ 6,324,350 773 8,182
All 7,808,740 969 8,059

Create a pairs plot with seaborn.

In [17]:
#create pair plots to better understand the data
'''
This basic pair plots leverages seaborn's statistical graphing package.
'''
dflm=pd.read_excel('dflowtomid.xlsx')
sns.set(style="ticks", color_codes=True)
dflm=sns.pairplot(dflm)
#plt.show()

Leverage altair to create a scatter plot of revenues by age groups. The scatter plot clearly highlights that the majority of accounts are aged and under $20k.

In [20]:
#Leverage altair to create a scatter plot of revenues by account ageing categories.
# The scatter plot clearly highlights that the majority of accounts are aged and under $20k.

import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')
chart = alt.Chart(dflm).mark_circle().encode(
    x='age',
    y='revenues',
    color='agegroup',
)
chart
Out[20]:

The graph below is also a pair plot, but with a different graph package. These graphs are interactive. Use your cursor to move points on one of the graphs.

In [21]:
#The graph below is also a pair plot, but with a different graph package. These graphs are interactive.
#Use your cursor to move points on one of the graphs.

alt.Chart(dflm).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
    color='agegroup:N'
).properties(
    width=150,
    height=150
).repeat(
    row=['revenues', 'age'],
    column=['age', 'revenues']
).interactive()
Out[21]:

A strip plot is a great way to visualize all of the data points by segments.

As you can see below this graph highlights 151+ as the largest category. With this type of plot the small number of high dollar accounts are easy to see.

In [14]:
stripplot =  alt.Chart(dflm, width=80).mark_circle(size=10).encode(
    x=alt.X(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale(),
    ),
    y=alt.Y('revenues:Q'),
    color=alt.Color('agegroup:N', legend=None),
    column=alt.Column(
        'agegroup:N',
        header=alt.Header(
            labelAngle=-90,
            titleOrient='top',
            labelOrient='bottom',
            labelAlign='right',
            labelPadding=3,
        ),
   ),
).transform_calculate(
    #Generate Gaussian jitter with a Box-Muller transform
    jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

stripplot
Out[14]:
In [15]:
dflm=pd.read_excel('dflowtomid.xlsx')

stripplot =  alt.Chart(dflm, width=80).mark_circle(size=10).encode(
    x=alt.X(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale()
    ),
    y=alt.Y('revenues:Q'),
    color=alt.Color('Dollar Thresholds:N', legend=None),
    column=alt.Column(
        'Dollar Thresholds:N',sort=alt.EncodingSortField(
        field='Dollar Threshold ID',
        order='ascending'),
        header=alt.Header(
            labelAngle=-90,
            titleOrient='top',
            labelOrient='bottom',
            labelAlign='right',
            labelPadding=3,
        ),
   ),
).transform_calculate(
    #Generate Gaussian jitter with a Box-Muller transform
    jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).properties(width=75)

stripplot
Out[15]:

0-500 has the highest count\ 20k-25k= largest revenue segment

In [16]:
dflm.groupby(['Dollar Threshold ID','Dollar Thresholds'])['revenues'].agg(['sum','count']).sort_values(by=['sum','count'], ascending=False)
Out[16]:
sum count
Dollar Threshold ID Dollar Thresholds
9 20,001-25,000 2,859,188 128
8 15,001-20,000 2,142,309 124
7 10,001-15,000 1,397,129 110
6 5,001-10,000 850,037 111
5 2,501-5,000 247,266 65
2 0-500 103,061 401
10 25,001-50,000 80,016 2
11 50,001-100,000 79,468 1
4 1,001-2,500 47,712 27
3 501-1,000 3,850 5
In [17]:
#Brushing Scatter Plot to show data on a table
#new df low to mid range dollars

dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format


import altair as alt
source = dflm
# Brush for selection
brush = alt.selection(type='interval')

# Scatter Plot
points = alt.Chart(source).mark_point().encode(
    x='age:Q',
    y='revenues:Q',
    color=alt.condition(brush, 'Patient Type:N', alt.value('grey'))
).add_selection(brush)

# Base chart for data tables
ranked_text = alt.Chart(source).mark_text().encode(
    y=alt.Y('row_number:O',axis=None)
).transform_window(
    row_number='row_number()'
).transform_filter(
    brush
).transform_window(
    rank='rank(row_number)'
).transform_filter(
    alt.datum.rank<20
)

# Data Tables
age = ranked_text.encode(text='age:N').properties(title='Acct Age')
revenues = ranked_text.encode(text='revenues:N').properties(title='Revenues')
payor = ranked_text.encode(text='payor:N').properties(title='Payor')
patient_type = ranked_text.encode(text='Patient Type:N').properties(title='Patiet Type')
text = alt.hconcat(age, revenues, payor, patient_type) # Combine data tables

# Build chart
alt.hconcat(
    points,
    text
).resolve_legend(
    color="independent"
)
Out[17]:
In [18]:
#new df low to mid range dollars

dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format


import altair as alt
source = dflm

alt.Chart(source).mark_circle().encode(
    alt.X('age:Q', bin=True),
    alt.Y('revenues:Q', bin=True),
    size='count()'
)
Out[18]:
In [19]:
#new df low to mid range dollars
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
import altair as alt
source = dflm
# Configure the options common to all layers
brush = alt.selection(type='interval')
base = alt.Chart(source).add_selection(brush)
# Configure the points
points = base.mark_point().encode(
    x=alt.X('age', title=''),
    y=alt.Y('revenues', title=''),
    color=alt.condition(brush, 'Patient Type', alt.value('grey'))
).properties(height=500)
# Configure the ticks
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)
x_ticks = base.mark_tick().encode(
    alt.X('age', axis=tick_axis),
    alt.Y('Patient Type', title='', axis=tick_axis),
    color=alt.condition(brush, 'Patient Type', alt.value('lightgrey'))
)
y_ticks = base.mark_tick().encode(
    alt.X('Patient Type', title='', axis=tick_axis),
    alt.Y('revenues', axis=tick_axis),
    color=alt.condition(brush, 'Patient Type', alt.value('lightgrey'))
).properties(height=500)
# Build the chart
y_ticks | (points & x_ticks)
Out[19]:
In [20]:
import altair as alt
df_low=pd.read_excel('dflowtomid.xlsx')
source = dflm
alt.Chart(source).mark_point().encode(
    x='age:Q',
    y='revenues:Q',
    row='Patient Type:N'
).properties(height=200)
Out[20]:
In [21]:
import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format

source = dflm

alt.Chart(source).mark_boxplot(size=40).encode(
    y='revenues:Q',
    x='agegroup:N',
    #color='Patient Type:N'
    
).properties(
    height=800,
    width=300)
Out[21]:
In [4]:
import altair as alt
df_low=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format

source = dflm

alt.Chart(source).mark_boxplot(size=40).encode(
    x='revenues:Q',
    y='Patient Type:N',
    color='agegroup',
).properties(
    width=700,
    height=200)
Out[4]:
In [87]:
import matplotlib.pyplot as plt
%matplotlib inline
data=dflm

ax=sns.swarmplot(x="Patient Type", y="revenues", hue='agegroup', data=dflm, size=6)
ax.set_title('Swarm Plot: Patient Types by Age Group')
fig = plt.gcf()
fig.set_size_inches(12, 6)
In [7]:
dflm
dflm=dflm.to_csv('eda.csv')
In [12]:
#create pair plots to better understand the data
'''
This basic pair plots leverages seaborn's statistical graphing package.
'''
dflm=pd.read_csv('eda.csv')
dflm_noid=dflm.iloc[:,1:7]
sns.set(style="ticks", color_codes=True)
dflm=sns.pairplot(dflm_noid)
#plt.show()
In [14]:
dflm_noid.shape
Out[14]:
(983, 6)
In [ ]: